# import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from tqdm.auto import tqdm
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_columns', None)
# Ignore warning
import warnings
warnings.filterwarnings('ignore')
# Location of the Seeds dataset on the UCI Machine Learning Repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"
# Column names: seven measured wheat-kernel features plus the variety label.
column_names = ["Area", "Perimeter", "Compactness", "Length of Kernel", "Width of Kernel", "Asymmetry Coefficient", "Length of Kernel Groove", "Class"]
# Read the whitespace-delimited file into a Pandas DataFrame.
# FIX: `delim_whitespace=True` is deprecated since pandas 2.1; `sep=r"\s+"`
# is the equivalent, forward-compatible spelling.
df = pd.read_csv(url, sep=r"\s+", names=column_names)
# Keep a dedicated copy so later cells can mutate `df` without losing this one.
df_seed = df.copy()
df_seed.head()
# URL of the Breast Cancer Wisconsin (Diagnostic) dataset on the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
# Column names: sample ID, diagnosis label, then 30 real-valued features
# (mean / standard-error / "worst" variants of 10 cell-nucleus measurements)
column_names = [
    "ID", "Diagnosis", "Mean Radius", "Mean Texture", "Mean Perimeter", "Mean Area", "Mean Smoothness",
    "Mean Compactness", "Mean Concavity", "Mean Concave Points", "Mean Symmetry", "Mean Fractal Dimension",
    "SE Radius", "SE Texture", "SE Perimeter", "SE Area", "SE Smoothness", "SE Compactness", "SE Concavity",
    "SE Concave Points", "SE Symmetry", "SE Fractal Dimension", "Worst Radius", "Worst Texture", "Worst Perimeter",
    "Worst Area", "Worst Smoothness", "Worst Compactness", "Worst Concavity", "Worst Concave Points",
    "Worst Symmetry", "Worst Fractal Dimension"
]
# Load the dataset into a Pandas DataFrame (file has no header row)
df = pd.read_csv(url, header=None, names=column_names)
# Remove the ID feature, which is irrelevant for modelling
df = df.drop('ID', axis = 1)
# Encode the class feature numerically: benign -> 0, malignant -> 1
df['Diagnosis'] = df['Diagnosis'].map({'B':0, 'M':1})
# Keep a dedicated copy so later cells can mutate `df` without losing this one
df_wdbc = df.copy()
df_wdbc.head()
def check_dataset_shape(df, dataset_name):
    """Print the row and column counts of *df*, tagged with *dataset_name*."""
    rows, cols = df.shape
    print(f'Total Rows in ({dataset_name}): ', rows)
    print(f'Total Columns in ({dataset_name}): ', cols)
    print()
# Report the dimensions of both datasets.
check_dataset_shape(df_seed, "Seeds Dataset")
check_dataset_shape(df_wdbc, "Breast Cancer Wisconsin DataSet")
# Class-balance pie charts for both datasets, side by side.
plt.figure(figsize=(13, 5))
plt.subplot(1, 2, 1)
df_wdbc['Diagnosis'].map({0:'B', 1:'M'}).value_counts().plot.pie(explode=[0.02,0.02],autopct='%1.2f%%',shadow=True, colors = ['green', 'red'],
textprops={'fontsize':15, 'color':'white'})
plt.ylabel('Diagnosis', fontsize=12)
plt.title(f'Frequency Distribution of\nWdbc Dataset', fontsize=14)
plt.legend()
plt.subplot(1,2,2)
df_seed['Class'].value_counts().plot.pie(explode=[0.02,0.02, 0.0],autopct='%1.2f%%',shadow=True, colors = ['green', 'red', 'blue'],
textprops={'fontsize':15, 'color':'white'})
# BUG FIX: the y-label previously read 'Death Event' (copy-pasted from a
# heart-failure notebook, see the deleted commented-out code); the Seeds pie
# shows wheat-variety classes.
plt.ylabel('Class', fontsize=12)
plt.title('Frequency Distribution of\nSeeds Dataset', fontsize=14)
plt.legend()
plt.show()
# !pip install yellowbrick
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Seeds dataset: split features from the class label
# (the original comment said "METABRIC", which is a different dataset)
X_seed = df_seed.drop('Class', axis = 1)
y_seed = df_seed['Class']
# WDBC dataset: split features from the diagnosis label
X_wdbc = df_wdbc.drop('Diagnosis', axis = 1)
y_wdbc = df_wdbc['Diagnosis']
# Standardize the WDBC features (zero mean, unit variance) — clustering and
# PCA are distance/variance based, so features must be on a comparable scale
std = StandardScaler()
X_wdbc_std = std.fit_transform(X_wdbc)
df_wdbc_std = pd.DataFrame(X_wdbc_std, columns=X_wdbc.columns)
# Untouched copy: later cells append cluster/PCA columns to df_wdbc_std itself
df_wdbc_std_org = df_wdbc_std.copy()
# Same standardization for the Seeds features, with its own scaler
std1 = StandardScaler()
X_seed_std = std1.fit_transform(X_seed)
df_seed_std= pd.DataFrame(X_seed_std, columns=X_seed.columns)
df_seed_std_org = df_seed_std.copy()
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mutual_info_score, davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples, adjusted_mutual_info_score
import time
def runKMeans(data, y, dataset_name):
    """Run KMeans for k = 2..9 on `data` and evaluate each clustering.

    For every k this records the inertia (WCSS), silhouette coefficient and
    Davies-Bouldin score (internal metrics), the adjusted Rand and adjusted
    mutual-information scores against the ground-truth labels `y` (external
    metrics), and the fit time. Draws an elbow plot plus a grouped bar chart
    of the metrics, then returns a DataFrame with one row per k.
    """
    dataframe = []  # one dict of metrics per k; becomes the returned DataFrame
    wcss_list = [] # Initializing the list for the values of WCSS
    SC = []      # silhouette coefficients
    DBS = []     # Davies-Bouldin scores
    ARS = []     # adjusted Rand scores (vs. ground truth y)
    AMIS = []    # adjusted mutual-information scores (vs. ground truth y)
    time__ = []  # wall-clock fit time per k, in seconds
    c = range(2, 10) # iterate on each cluster count k
    fig, axes = plt.subplots(1, 2, figsize=(18, 8)) # Two plots in each row
    for i in c:
        start = time.time()
        kmeans = KMeans(n_clusters=i, random_state=42, init='k-means++')
        kmeans.fit(data)
        end = time.time() - start  # fit time only (metrics excluded)
        time__.append(end)
        wcss_list.append(kmeans.inertia_)  # within-cluster sum of squares (elbow metric)
        # Internal metrics: computed from the data and predicted labels only.
        sc__ = silhouette_score(data, kmeans.labels_, metric='euclidean')
        SC.append(sc__)
        dbs__ = davies_bouldin_score(data, kmeans.labels_)
        DBS.append(dbs__)
        # External metrics: agreement between predicted clusters and true labels.
        ars__ = adjusted_rand_score(y, kmeans.labels_)
        amis__ = adjusted_mutual_info_score(y, kmeans.labels_)
        AMIS.append(amis__)
        ARS.append(ars__)
        dataframe.append({
            'Clustering': 'KMeans',
            'Dataset': dataset_name,
            'Clusters': i,
            'Elbow (WSSE)': kmeans.inertia_,
            'Silhouette Coefficient': sc__,
            'Davies-Bouldin Score': dbs__,
            'Adjusted Rand Score': ars__,
            'Adjusted Mutual Info Score': amis__,
            'Time Taken': end
        })
    # Elbow plot: WCSS vs. k
    ax1 = axes[0]
    ax1.plot(c, wcss_list, marker='o')
    ax1.set_title(f'KMeans - Elbow Method ({dataset_name} Dataset)', fontsize=14)
    ax1.set_xlabel('Number of clusters (k)', fontsize=14)
    ax1.set_ylabel('Sum of Squared Error (SSE)', fontsize=14)
    ax1.grid(False) # Remove grid lines
    # Evaluation metrics bar plot: one group of bars per k, one bar per metric
    ax2 = axes[1]
    metric_scores = [SC, DBS, ARS, AMIS, time__]
    metric_names = ['Silhouette Coefficient', 'Davies-Bouldin Score', 'Adj Rand Score', 'Adj Mutual Info Score',
                    'Time']
    clusters = np.arange(len(c))
    width = 0.15 # Adjust the width between each cluster's bar plot
    for i in range(len(metric_scores)):
        bars = ax2.bar(clusters + i * width, metric_scores[i], width)
        # Label every bar with its numeric value.
        for j, bar in enumerate(bars):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width() / 2, height, f'{metric_scores[i][j]:.2f}', ha='center', va='bottom')
    ax2.set_xlabel('Clusters', fontsize=14)
    ax2.set_ylabel('Score', fontsize=14)
    ax2.set_title(f'KMeans - Evaluation Metrics by Cluster\n({dataset_name} Dataset)', fontsize=14)
    ax2.set_xticks(clusters)
    ax2.set_xticklabels(c)
    ax2.legend(metric_names)
    ax2.grid(False) # Remove grid lines
    plt.tight_layout()
    plt.show()
    return pd.DataFrame(dataframe)
def plot_silhouette(X, dataset_name):
    """Draw per-sample silhouette plots for KMeans with k = 2..5 on X.

    One subplot per k; the dashed red line marks the mean silhouette score.
    """
    # Range of cluster numbers to try
    range_clusters = range(2, 6)
    # BUG FIX: the original allocated only 3 axes for 4 cluster counts, so
    # the k=5 plot was silently dropped by zip(); allocate one axis per k.
    n_plots = len(range_clusters)
    fig, axs = plt.subplots(1, n_plots, figsize=(21, 5))
    axs = np.atleast_1d(axs).flatten()
    for n_clusters, ax in zip(range_clusters, axs):
        # Cluster and compute the average silhouette score for this k
        clusterer = KMeans(n_clusters=n_clusters, random_state=42, init='k-means++')
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg__ = silhouette_score(X, cluster_labels)
        # Compute the silhouette score for each sample
        silhouette_avg = silhouette_samples(X, cluster_labels)
        ax.set_xlim([-0.2, 1])
        ax.set_ylim([0, len(X) + (n_clusters + 1) * 10])
        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to the cluster
            ith_cluster_silhouette_values = silhouette_avg[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = plt.cm.nipy_spectral(float(i) / n_clusters)
            ax.fill_betweenx(np.arange(y_lower, y_upper),
                             0, ith_cluster_silhouette_values,
                             facecolor=color, edgecolor=color, alpha=0.7)
            # Label the silhouette plots with their cluster numbers
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # 10-sample gap between cluster bands
        # The vertical line for average silhouette score of all the values
        ax.axvline(x=silhouette_avg__, color="red", linestyle="--")
        ax.set_title(f'Silhouette Plot (k={n_clusters})\nSilhouette Score: {silhouette_avg__:.2f}\n({dataset_name} Dataset)')
        ax.set_xlabel("Silhouette coefficient values")
        ax.set_ylabel("Cluster label")
        ax.grid(False) # Remove grid lines
    # Show the plot
    plt.tight_layout()
    plt.show()
# KMeans over k = 2..9 on the standardized (non-reduced) features.
res_wdbc_kmeans_org = runKMeans(df_wdbc_std,y_wdbc, 'Wdbc')
res_wdbc_kmeans_org['Technique'] = 'KMeans'  # tag rows for later concatenation
res_seeds_kmeans_org = runKMeans(df_seed_std,y_seed, 'Seeds')
res_seeds_kmeans_org['Technique'] = 'KMeans'
# Per-sample silhouette plots for several k values on each dataset.
plot_silhouette(df_wdbc_std, 'Wdbc')
plot_silhouette(df_seed_std, 'Seeds')
While the elbow method can be a useful technique for finding the optimal number of clusters, it does have some limitations. For example, in some cases, the plot of WCSS as a function of k may not have a well-defined elbow point, which can make it difficult to determine the optimal number of clusters. Thus we will also check several other metrics to find the optimal number of clusters.
Silhouette Coefficient: Silhouette Coefficient is better when it is closer to 1, indicating that data points are well matched to their own clusters and poorly matched to neighboring clusters.
Davies-Bouldin Score: Davies-Bouldin Score is better when it is lower, signifying a tighter and more well-defined clustering structure with minimal overlap.
Adjusted Rand Score: Adjusted Rand Score is better when it is closer to 1, implying a high agreement between true and predicted clusters while considering chance.
Adjusted Mutual Information Score: Adjusted Mutual Information Score is better when it is closer to 1, indicating a high amount of mutual information shared between true and predicted cluster assignments.
For the Wisconsin Breast Cancer Dataset, it is evident that having 2 clusters is the optimal choice. This conclusion is drawn from the fact that the clustering results exhibit the following characteristics:
Adjusted Rand Score:The Adjusted Rand Score is significantly closer to 1, indicating a high level of agreement between the true data labels and the predicted clusters. This suggests that a 2-cluster solution aligns well with the underlying structure of the data.
Silhouette Score: The Silhouette Score is notably higher, signifying that data points are well-matched to their respective clusters and less matched to neighboring clusters. This indicates that a 2-cluster configuration provides a clear and well-separated clustering structure.
Adjusted Mutual Information Score: The Adjusted Mutual Information Score is substantially closer to 1, implying a substantial amount of mutual information shared between the true class labels and the predicted clusters. This reinforces the effectiveness of a 2-cluster solution.
Davies-Bouldin Score: The Davies-Bouldin Score is notably lower, indicating minimal overlap and tighter clustering. This further supports the choice of 2 clusters as it results in well-separated and distinct clusters.
The silhouette sample plot is a valuable tool for assessing the quality and cohesion of clusters. In this case, it reaffirms the selection of 2 clusters as the optimal choice, as it results in a well-distributed clustering configuration. This not only aligns with the performance metrics discussed earlier but also provides a visual representation of the quality of the clustering solution.
In summary, the optimal clustering solution for the Wisconsin Breast Cancer Dataset appears to be 2 clusters, as it aligns with the data's inherent structure and exhibits superior performance across multiple evaluation metrics.
On the Seeds dataset, which contains three distinct ground truth labels, a KMeans clustering analysis was performed for different cluster configurations (ranging from 2 to 9 clusters). The following observations can be made based on the clustering results and evaluation metrics:
Elbow (WSSE):The WSSE (Within-Cluster Sum of Squares Error) was used to assess the variance within the clusters. It typically decreases as the number of clusters increases. In this case, the plot of WSSE shows an "elbow" point at around 3 or 4 clusters. Beyond this point, the reduction in WSSE is less significant. This suggests that 3 or 4 clusters might be a suitable choice, with 3 being the most favored choice as it aligns with the true number of classes.
Silhouette Coefficient:The Silhouette Coefficient measures how similar an object is to its cluster compared to other clusters. Higher values indicate better cluster quality. In this analysis, the Silhouette Coefficient is highest when using 2 clusters, indicating well-separated clusters. However, 3 clusters also show a respectable Silhouette score, suggesting reasonable separation.
Davies-Bouldin Score:The Davies-Bouldin Score evaluates the average similarity between each cluster with the one that is most similar to it. Lower values indicate better separation between clusters. The Davies-Bouldin Score is lowest for 4 clusters, implying the most distinct clusters. However, 3 clusters also achieve a low score.
Adjusted Rand Score:The Adjusted Rand Score assesses the similarity between true labels and cluster assignments. It is close to 1 when the two are in perfect agreement. In this case, the best agreement with the true labels is seen with 3 clusters, as it has the highest Adjusted Rand Score.
Adjusted Mutual Info Score:The Adjusted Mutual Info Score is another measure of the agreement between true labels and clustering results. Like the Adjusted Rand Score, the highest score is also observed when using 3 clusters.
In summary, considering all these metrics and that the true number of classes in the Seeds dataset is 3, the choice of either 3 or 4 clusters seems appropriate. The KMeans analysis with 3 clusters is particularly promising, as it aligns with the true number of classes and achieves high Silhouette, Davies-Bouldin, Adjusted Rand, and Adjusted Mutual Info scores.
# Final KMeans models at the chosen cluster counts: k=2 for WDBC, k=3 for Seeds.
kmeans1 = KMeans(n_clusters=2, random_state=42, init='k-means++')
kmeans1.fit(df_wdbc_std)
# Attach the hard assignments to the raw (unstandardized) frame for plotting.
df_wdbc['KMeans_Clusters'] = kmeans1.labels_
kmeans2 = KMeans(n_clusters=3, random_state=42, init='k-means++')
kmeans2.fit(df_seed_std)
df_seed['KMeans_Clusters'] = kmeans2.labels_
def plot_clusters_distribution(df, cols='KMeans_Clusters'):
    """Show side-by-side count plots of the cluster assignments stored in
    column `cols`, for the pair of DataFrames `df` = [wdbc_frame, seeds_frame]."""
    fig, axes = plt.subplots(1, 2, figsize=(9, 5))
    wdbc_frame, seeds_frame = df
    panels = [
        (wdbc_frame, 'Cluster Distribution (WDBC)'),
        (seeds_frame, 'Cluster Distribution (Seeds)'),
    ]
    for ax, (frame, title) in zip(axes, panels):
        sns.countplot(data=frame, x=cols, ax=ax)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Cluster', fontsize=16)
        ax.set_ylabel('Count', fontsize=16)
        # Annotate each bar with its count, slightly above the bar top.
        for p in ax.patches:
            ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=12, color='black', xytext=(0, 5), textcoords='offset points')
        ax.grid(False)  # Remove grid lines
    plt.tight_layout()
    plt.show()
# Distribution of the KMeans cluster assignments on both datasets.
plot_clusters_distribution([df_wdbc, df_seed], 'KMeans_Clusters')
from sklearn.mixture import GaussianMixture
def runGMM(data, y, dataset_name):
    """Run Gaussian-mixture (EM) clustering for 2..9 components on `data`.

    For every component count this records BIC/AIC, the silhouette
    coefficient and Davies-Bouldin score (internal metrics), the adjusted
    Rand and adjusted mutual-information scores against the ground-truth
    labels `y` (external metrics), and the fit time. Draws the BIC/AIC
    curves plus a grouped bar chart of the metrics, then returns a DataFrame
    with one row per component count.
    """
    dataframe = []  # one dict of metrics per component count
    bic = []     # Bayesian information criterion per count (lower is better)
    aic = []     # Akaike information criterion per count (lower is better)
    SC = []      # silhouette coefficients
    DBS = []     # Davies-Bouldin scores
    ARS = []     # adjusted Rand scores (vs. ground truth y)
    AMIS = []    # adjusted mutual-information scores (vs. ground truth y)
    time__ = []  # wall-clock fit time, in seconds
    c = range(2, 10) # iterate on each component count
    fig, axes = plt.subplots(1, 2, figsize=(18, 8)) # Two plots in each row
    for i in c:
        start = time.time()
        gmm = GaussianMixture(n_components=i, random_state=42)
        gmm.fit(data)
        end = time.time() - start  # fit time only (prediction excluded)
        y_pred = gmm.predict(data)  # hard cluster assignments
        time__.append(end)
        bic__ = gmm.bic(data)
        bic.append(bic__)
        aic__ = gmm.aic(data)
        aic.append(aic__)
        # Internal metrics: computed from the data and predicted labels only.
        sc__ = silhouette_score(data, y_pred, metric='euclidean')
        SC.append(sc__)
        dbs__ = davies_bouldin_score(data, y_pred)
        DBS.append(dbs__)
        # External metrics: agreement between predicted clusters and true labels.
        ars__ = adjusted_rand_score(y, y_pred)
        amis__ = adjusted_mutual_info_score(y,y_pred)
        AMIS.append(amis__)
        ARS.append(ars__)
        dataframe.append({
            'Clustering': 'EM',
            'Dataset': dataset_name,
            'Clusters': i,
            'BIC': bic__,
            'AIC': aic__,
            'Silhouette Coefficient': sc__,
            'Davies-Bouldin Score': dbs__,
            'Adjusted Rand Score': ars__,
            'Adjusted Mutual Info Score': amis__,
            'Time Taken': end
        })
    # BIC/AIC curves vs. number of components (model-selection plot)
    ax1 = axes[0]
    ax1.plot(c, bic, marker='o', label='BIC')
    ax1.plot(c, aic, marker='o', label='AIC')
    ax1.set_title(f'EM - BIC and AIC ({dataset_name} Dataset)', fontsize=14)
    ax1.set_xlabel('Number of components', fontsize=14)
    ax1.set_ylabel('Score', fontsize=14)
    ax1.legend()
    ax1.grid(False) # Remove grid lines
    # Evaluation metrics bar plot: one group of bars per count, one bar per metric
    ax2 = axes[1]
    metric_scores = [SC, DBS, ARS, AMIS, time__]
    metric_names = ['Silhouette Coefficient', 'Davies-Bouldin Score', 'Adj Rand Score', 'Adj Mutual Info Score',
                    'Time']
    clusters = np.arange(len(c))
    width = 0.15 # Adjust the width between each cluster's bar plot
    for i in range(len(metric_scores)):
        bars = ax2.bar(clusters + i * width, metric_scores[i], width)
        # Label every bar with its numeric value.
        for j, bar in enumerate(bars):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width() / 2, height, f'{metric_scores[i][j]:.2f}', ha='center', va='bottom')
    ax2.set_xlabel('Clusters', fontsize=14)
    ax2.set_ylabel('Score', fontsize=14)
    ax2.set_title(f'EM - Evaluation Metrics by Cluster\n({dataset_name} Dataset)', fontsize=14)
    ax2.set_xticks(clusters)
    ax2.set_xticklabels(c)
    ax2.legend(metric_names)
    ax2.grid(False) # Remove grid lines
    plt.tight_layout()
    plt.show()
    return pd.DataFrame(dataframe)
def plot_silhouette_GMM(X, dataset_name):
    """Draw per-sample silhouette plots for GMM with 2..5 components on X.

    One subplot per component count; the dashed red line marks the mean
    silhouette score.
    """
    # Range of cluster numbers to try
    range_clusters = range(2, 6)
    # BUG FIX: the original allocated only 3 axes for 4 cluster counts, so
    # the k=5 plot was silently dropped by zip(); allocate one axis per k.
    n_plots = len(range_clusters)
    fig, axs = plt.subplots(1, n_plots, figsize=(21, 5))
    axs = np.atleast_1d(axs).flatten()
    for n_clusters, ax in zip(range_clusters, axs):
        # Create GaussianMixture instance (the original comment said KMeans)
        clusterer = GaussianMixture(n_components=n_clusters, random_state=42)
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg__ = silhouette_score(X, cluster_labels)
        # Compute the silhouette score for each sample
        silhouette_avg = silhouette_samples(X, cluster_labels)
        ax.set_xlim([-0.2, 1])
        ax.set_ylim([0, len(X) + (n_clusters + 1) * 10])
        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to the cluster
            ith_cluster_silhouette_values = silhouette_avg[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = plt.cm.nipy_spectral(float(i) / n_clusters)
            ax.fill_betweenx(np.arange(y_lower, y_upper),
                             0, ith_cluster_silhouette_values,
                             facecolor=color, edgecolor=color, alpha=0.7)
            # Label the silhouette plots with their cluster numbers
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # 10-sample gap between cluster bands
        # The vertical line for average silhouette score of all the values
        ax.axvline(x=silhouette_avg__, color="red", linestyle="--")
        ax.set_title(f'Silhouette Plot (k={n_clusters})\nSilhouette Score: {silhouette_avg__:.2f}\n({dataset_name} Dataset)')
        ax.set_xlabel("Silhouette coefficient values")
        ax.set_ylabel("Cluster label")
        ax.grid(False) # Remove grid lines
    # Show the plot
    plt.tight_layout()
    plt.show()
# EM (Gaussian mixture) over 2..9 components on the standardized features.
res_wdbc_gmm_org = runGMM(df_wdbc_std,y_wdbc, 'Wdbc')
res_wdbc_gmm_org['Technique'] = 'EM'  # tag rows for later concatenation
res_seeds_gmm_org = runGMM(df_seed_std,y_seed, 'Seeds')
res_seeds_gmm_org['Technique'] = 'EM'
plot_silhouette_GMM(df_wdbc_std, 'Wdbc')
plot_silhouette_GMM(df_seed_std, 'Seeds')
# Final GMM models at the chosen component counts (2 for WDBC, 3 for Seeds);
# store the hard cluster assignments on the raw frames.
gmm1 = GaussianMixture(n_components=2, random_state=42)
label1 = gmm1.fit_predict(df_wdbc_std)
df_wdbc['GMM_Clusters'] = label1
gmm2 = GaussianMixture(n_components=3, random_state=42)
label2 = gmm2.fit_predict(df_seed_std)
df_seed['GMM_Clusters'] = label2
# Distribution of the GMM cluster assignments on both datasets.
plot_clusters_distribution([df_wdbc, df_seed], 'GMM_Clusters')
from sklearn.decomposition import PCA
# Dimensionality reduction through PCA — WDBC dataset. All components are
# kept here; the cumulative-variance plot below is used to pick a count.
pca = PCA()
wdbc_pca_reduced = pca.fit_transform(df_wdbc_std)
# Dimensionality reduction through PCA — Seeds dataset
pca1 = PCA()
seed_pca_reduced = pca1.fit_transform(df_seed_std)
def plot_cumm_var(pca, dataset_name, threshold=None):
    """Plot the cumulative explained-variance curve of a fitted PCA.

    Parameters
    ----------
    pca : fitted sklearn PCA instance.
    dataset_name : label used in the plot title.
    threshold : optional cumulative-variance cutoff in (0, 1]. When given, a
        dashed line is drawn at that level and the title reports how many
        components stay at or below the cutoff.

    BUG FIX: `threshold` defaults to None but was previously used
    unconditionally in a `<=` comparison, so calling with the default raised
    a TypeError; the filtering is now applied only when a threshold is given.
    """
    pc_list = ["PC"+str(i) for i in list(range(1, len(pca.explained_variance_ratio_.tolist())+1))]
    pca_variance = np.cumsum(pca.explained_variance_ratio_.tolist())
    pca_df = pd.DataFrame({'pca_feature': pc_list, 'cummulative variance': pca_variance})
    if threshold is not None:
        # Keep only the components whose cumulative variance is within the cutoff.
        pca_df = pca_df[pca_df['cummulative variance'] <= threshold]
    pca_df['cummulative variance'] = round(pca_df['cummulative variance']*100, 2)
    plt.figure(figsize=(15, 5))
    plt.plot(pc_list, pca_variance, c='red', marker='X', linewidth=2, markersize=9)
    if threshold:
        # Dashed line at the requested cumulative-variance level.
        value99 = pca_variance[-1]*threshold
        plt.hlines(y=value99, xmin=0, xmax=len(pca.explained_variance_ratio_.tolist()), color='r', linestyles='dashed')
    plt.title(f"PCA Variance Graph ({dataset_name} Dataset) - Total Best Feature {len(pca_df)}", fontsize=18)
    plt.xlabel("PCs", fontsize=18)
    plt.ylabel("Cummulative Variance", fontsize=18)
    plt.xticks(rotation=90)
    plt.grid(False)  # Remove grid lines
    plt.show()
def plot_clusters(df, pca, clustering, clustering_name, dataset_name):
    """Scatter the first two PCA components (columns 'x0'/'x1' of `df`)
    colored by the 'cluster' column, with the fitted cluster centroids
    projected into PCA space and marked by red stars."""
    plt.figure(figsize=(8, 5))
    plt.title(f"{clustering_name} Clustering ({dataset_name} Dataset) ", fontdict={"fontsize": 16})
    plt.xlabel("X0", fontdict={"fontsize": 16})
    plt.ylabel("X1", fontdict={"fontsize": 16})
    # Points, colored by cluster assignment.
    sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
    # Project the centroids (computed in the original feature space) into the
    # same 2-D PCA space as the points.
    projected_centroids = pca.transform(clustering.cluster_centers_)
    plt.scatter(projected_centroids[:, 0], projected_centroids[:, 1], marker='*', s=200,
                c='r', label='centroid')
    # These intentionally overwrite the X0/X1 labels set above.
    plt.xlabel("Component 1", fontsize=15)
    plt.ylabel("Component 2", fontsize=15)
    plt.legend(bbox_to_anchor=(1, 1))
    plt.grid(False)  # Remove grid lines
    plt.show()
# Draw the cumulative-variance curves with a threshold line at 0.9 (90%).
# (The original comment said 95%, but the value actually passed is 0.9.)
plot_cumm_var(pca, 'Wdbc', threshold=0.9)
plot_cumm_var(pca1, 'Seeds', threshold=0.9)
For the Wisconsin Dataset: a reduction to just 6 principal components (features) allows us to capture 90% of the variance in the data. This indicates that a substantial dimensionality reduction can be achieved while retaining most of the dataset's information.
For the Seeds Dataset: a mere 2 principal components (features) are sufficient to capture 90% of the dataset's variance.
# PCA with the chosen 6 components for WDBC (captures ~90% of the variance).
pca = PCA(n_components=6)
# pass our X to the pca and store the reduced vectors into data_pca
data_pca = pca.fit_transform(df_wdbc_std)
# save the first two components into x0 and x1 for 2-D scatter plots
x0 = data_pca[:, 0]
x1 = data_pca[:, 1]
# Assign clusters and pca vectors to our dataframe.
# NOTE: these columns are appended to df_wdbc_std itself, so from here on
# that frame carries extra non-feature columns.
df_wdbc_std['cluster'] = kmeans1.labels_
df_wdbc_std['cluster_gmm'] = label1
df_wdbc_std['x0'] = x0
df_wdbc_std['x1'] = x1
# KMeans on the PCA-reduced WDBC data.
res_wdbc_kmeans_pca = runKMeans(data_pca,y_wdbc, 'Wdbc')
res_wdbc_kmeans_pca['Technique'] = 'PCA'
plot_silhouette(data_pca, 'Wdbc')
# PCA with the chosen 2 components for the Seeds dataset.
pca1 = PCA(n_components=2)
data_pca2 = pca1.fit_transform(df_seed_std)
# save our two dimensions into x0 and x1
x0 = data_pca2[:, 0]
x1 = data_pca2[:, 1]
# assign clusters and pca vectors to our dataframe (same in-place caveat)
df_seed_std['cluster'] = kmeans2.labels_
df_seed_std['cluster_gmm'] = label2
df_seed_std['x0'] = x0
df_seed_std['x1'] = x1
# KMeans on the PCA-reduced Seeds data.
res_seeds_kmeans_pca = runKMeans(data_pca2,y_seed, 'Seeds')
res_seeds_kmeans_pca['Technique'] = 'PCA'
plot_silhouette(data_pca2, 'Seeds')
# 2-D cluster scatter plots with the projected centroids.
plot_clusters(df_wdbc_std,pca, kmeans1, 'KMeans', 'Wdbc')
plot_clusters(df_seed_std,pca1, kmeans2, 'KMeans', 'Seeds')
# EM on the PCA-reduced data.
res_wdbc_gmm_pca = runGMM(data_pca,y_wdbc, 'Wdbc')
res_wdbc_gmm_pca['Technique'] = 'PCA'
plot_silhouette_GMM(data_pca, 'Wdbc')
res_seeds_gmm_pca = runGMM(data_pca2,y_seed, 'Seeds')
res_seeds_gmm_pca['Technique'] = 'PCA'
plot_silhouette_GMM(data_pca2, 'Seeds')
def plot_clusters_GMM(df, pca, clustering_name, dataset_name):
    """Scatter the first two PCA components (columns 'x0'/'x1' of `df`)
    colored by the 'cluster_gmm' column.

    Note: the `pca` argument is accepted for signature parity with
    plot_clusters but is not used here (GMM has no `cluster_centers_`).
    """
    plt.figure(figsize=(8, 5))
    plt.title(f"{clustering_name} Clustering ({dataset_name} Dataset) ", fontdict={"fontsize": 16})
    plt.xlabel("X0", fontdict={"fontsize": 16})
    plt.ylabel("X1", fontdict={"fontsize": 16})
    # Points, colored by the GMM cluster assignment.
    sns.scatterplot(data=df, x='x0', y='x1', hue='cluster_gmm', palette="viridis")
    # These intentionally overwrite the X0/X1 labels set above.
    plt.xlabel("Component 1", fontsize=15)
    plt.ylabel("Component 2", fontsize=15)
    plt.legend(bbox_to_anchor=(1, 1))
    plt.grid(False)  # Remove grid lines
    plt.show()
# GMM cluster scatter plots on the PCA-projected data.
plot_clusters_GMM(df_wdbc_std, pca, 'GMM', 'Wdbc')
plot_clusters_GMM(df_seed_std, pca1, 'GMM', 'Seeds')
For the Wisconsin Dataset:
For the Seeds Dataset:
In both cases, the alignment between the number of principal components and the optimal number of clusters suggests that PCA has successfully reduced the data's dimensionality while preserving the underlying structure, facilitating more interpretable and effective clustering results.
from sklearn.decomposition import FastICA
from scipy.stats import kurtosis
# ICA with all components on each dataset; the diagnostics below are used to
# choose a component count from kurtosis / scree / information criteria.
# NOTE(review): df_wdbc_std / df_seed_std carry appended 'cluster'/'x0'/'x1'
# columns at this point; the untouched *_org copies may have been intended.
ica = FastICA(random_state=42)
S_ = ica.fit_transform(df_wdbc_std)
ica1 = FastICA(random_state=42)
S__ = ica1.fit_transform(df_seed_std)
def plot_ICA_check_Components(df,ica_data, dataset_name):
    """Diagnostics for choosing the number of ICA components.

    Plots three panels from the full ICA transform `ica_data` of DataFrame
    `df`: a scree-style cumulative-variance plot, the cumulative kurtosis of
    the sorted components, and AIC/BIC-style curves.

    NOTE(review): the AIC/BIC formulas below are heuristic — the
    log-likelihood term is approximated from component kurtosis magnitudes,
    not computed from a fitted model — so treat the curves as relative
    guides only.
    """
    # Compute kurtosis for each component (a proxy for non-Gaussianity)
    kurtosis_values = kurtosis(ica_data)
    # Sort the kurtosis values in descending order
    sorted_kurtosis = np.sort(kurtosis_values)[::-1]
    # Cumulative share of total kurtosis captured by the top components
    cumulative_kurtosis = np.cumsum(sorted_kurtosis) / np.sum(sorted_kurtosis)
    # Scree plot data: per-component variance relative to the original
    # feature variances (assumes positional alignment of the columns —
    # TODO confirm)
    explained_variance_ratio = np.var(ica_data, axis=0) / np.var(df.values, axis=0)
    cumulative_explained_variance = np.cumsum(explained_variance_ratio)
    # Information criteria (AIC and BIC), built from kurtosis magnitudes
    n_components = np.arange(1, len(kurtosis_values) + 1)
    aic = []
    bic = []
    for n in n_components:
        aic_value = 2 * n - 2 * np.sum(np.log(np.abs(kurtosis_values[:n])))
        bic_value = n * np.log(df.shape[1]) - 2 * np.sum(np.log(np.abs(kurtosis_values[:n])))
        aic.append(aic_value)
        bic.append(bic_value)
    # Plotting: three side-by-side panels
    plt.figure(figsize=(12, 5))
    # Scree plot
    plt.subplot(1, 3, 1)
    plt.plot(n_components, cumulative_explained_variance, marker='o')
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance")
    plt.title(f"Scree Plot\n{dataset_name} Dataset")
    plt.grid(False) # Remove grid lines
    # Cumulative kurtosis
    plt.subplot(1, 3, 2)
    plt.plot(range(1, len(sorted_kurtosis) + 1), cumulative_kurtosis, marker='o')
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Kurtosis")
    plt.title(f"Cumulative Kurtosis\n{dataset_name} Dataset")
    plt.grid(False) # Remove grid lines
    # AIC and BIC
    plt.subplot(1, 3, 3)
    plt.plot(n_components, aic, marker='o', label='AIC')
    plt.plot(n_components, bic, marker='o', label='BIC')
    plt.xlabel("Number of Components")
    plt.ylabel("Information Criteria Value")
    plt.title(f"Information Criteria (AIC and BIC)\n{dataset_name} Dataset")
    plt.grid(False) # Remove grid lines
    plt.legend()
    plt.tight_layout()
    plt.show()
# ICA component-count diagnostics for each dataset.
plot_ICA_check_Components(df_wdbc_std, S_, 'Wdbc')
plot_ICA_check_Components(df_seed_std, S__, 'Seeds')
Based on kurtosis analysis/Scree Plot/BIC:
For the Wisconsin dataset, it is recommended to choose 10 components using kurtosis analysis. This number appears to capture the most non-Gaussian and independent features in the dataset, making it suitable for dimensionality reduction or further analysis.
For the Seeds dataset, the optimal number of components, as determined by kurtosis analysis, is 4. These 4 components are likely to capture the essential non-Gaussian and independent characteristics of the data, allowing for effective feature reduction or analysis.
# Refit ICA with the chosen component counts: 10 for WDBC, 4 for Seeds.
ica = FastICA(n_components=10, random_state=42)
W_ = ica.fit_transform(df_wdbc_std)
ica1 = FastICA(n_components=4,random_state=42)
# NOTE: S_ is rebound here (it previously held the full WDBC ICA transform).
S_ = ica1.fit_transform(df_seed_std)
# Leftover notebook inspection expression; has no effect in a script.
data_pca.shape
# KMeans on the ICA-transformed data.
res_wdbc_kmeans_ica = runKMeans(W_,y_wdbc, 'Wdbc')
res_wdbc_kmeans_ica['Technique'] = 'ICA'
plot_silhouette(W_, 'Wdbc')
res_seed_kmeans_ica = runKMeans(S_,y_seed, 'Seeds')
res_seed_kmeans_ica['Technique'] = 'ICA'
plot_silhouette(S_, 'Seeds')
# EM on the ICA-transformed data.
res_wdbc_gmm_ica = runGMM(W_,y_wdbc, 'Wdbc')
res_wdbc_gmm_ica['Technique'] = "ICA"
plot_silhouette_GMM(W_, 'Wdbc')
res_seeds_gmm_ica = runGMM(S_,y_seed, 'Seeds')
res_seeds_gmm_ica['Technique'] = "ICA"
plot_silhouette_GMM(S_, 'Seeds')
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics import mean_squared_error
def plot_RP_check_Components(X, dataset_name):
    """Plot the reconstruction RMSE of Gaussian Random Projection against the
    number of projected components (1 .. n_features-1) for DataFrame X."""
    recons_err = []
    fit_time = []  # kept for parity with the original cell; never populated
    # Max components = one fewer than the number of features.
    comp = range(1, X.shape[1])
    for n_comp in comp:
        projector = GaussianRandomProjection(n_components = n_comp, random_state=42)
        projector.fit(X)
        projected = projector.transform(X)
        # Reconstruct via the pseudo-inverse of the projection matrix and
        # measure the round-trip error.
        inverse_map = np.linalg.pinv(np.transpose(projector.components_))
        reconstructed = np.dot(projected, inverse_map)
        reconstructed = pd.DataFrame(data=reconstructed, columns=X.columns)
        recons_err.append(np.sqrt(mean_squared_error(X, reconstructed)))
    plt.figure(figsize=(6,5))
    plt.plot(comp, recons_err, marker='o')
    plt.title(f'Randomized Projections - {dataset_name} Dataset')
    plt.xlabel('Number of components')
    plt.ylabel('Reconstruction error (RMSE)')
    plt.grid()
    plt.show()
# Reconstruction-error curves to choose an RP component count per dataset.
plot_RP_check_Components(df_wdbc_std, 'Wdbc')
plot_RP_check_Components(df_seed_std, 'Seeds')
For the Wisconsin dataset, analysis of the reconstruction error (RMSE) suggests the optimal number of components is around 10. This is the point at which the reconstruction error starts to level off, indicating that further components do not significantly contribute to improving the reconstruction. Choosing 10 components is therefore a reasonable choice for this dataset.
For the Seeds dataset, the analysis of the reconstruction error suggests that the optimal number of components is around 3. At this point, the reconstruction error starts to level off, indicating that you can achieve dimensionality reduction with minimal loss of information by retaining 3 components.
# Random projection with the chosen component counts: 10 for WDBC, 3 for Seeds.
rp = GaussianRandomProjection(n_components=10, random_state=42)
W_RP = rp.fit_transform(df_wdbc_std)
rp1 = GaussianRandomProjection(n_components=3,random_state=42)
S_RP = rp1.fit_transform(df_seed_std)
# KMeans on the RP-projected data.
res_wdbc_kmeans_rp = runKMeans(W_RP,y_wdbc, 'Wdbc')
res_wdbc_kmeans_rp['Technique'] = 'RP'
plot_silhouette(W_RP, 'Wdbc')
res_seeds_kmeans_rp = runKMeans(S_RP,y_seed, 'Seeds')
res_seeds_kmeans_rp['Technique'] = 'RP'
plot_silhouette(S_RP, 'Seeds')
# EM on the RP-projected data.
res_wdbc_gmm_rp = runGMM(W_RP,y_wdbc, 'Wdbc')
res_wdbc_gmm_rp['Technique'] = 'RP'
plot_silhouette_GMM(W_RP, 'Wdbc')
res_seeds_gmm_rp = runGMM(S_RP,y_seed, 'Seeds')
res_seeds_gmm_rp['Technique'] = 'RP'
plot_silhouette_GMM(S_RP, 'Seeds')
from sklearn.ensemble import RandomForestClassifier
def plot_RF_Feature_Importance(X,y, dataset_name):
    """Fit a 100-tree Random Forest on (X, y), plot its feature importances
    as a horizontal bar chart (least important at the bottom), and return the
    feature names ordered from most to least important."""
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    names = X.columns
    # Ascending order of importance, so the bar chart reads bottom-up.
    order = np.argsort(importances)
    plt.figure(figsize=(7, 6))
    positions = range(len(importances))
    plt.barh(positions, importances[order], align="center")
    plt.yticks(positions, np.array(names)[order])
    plt.xlabel("Feature Importance")
    plt.title(f"Random Forest Feature Importance ({dataset_name}) Dataset")
    plt.grid(False)
    plt.show()
    # Reverse to descending importance for the caller.
    return np.array(names)[order][::-1]
# Importance-ranked feature lists, computed on the untouched standardized
# copies (no appended cluster/PCA columns).
top_features_wdbc = plot_RF_Feature_Importance(df_wdbc_std_org,y_wdbc, 'Wdbc')
top_features_seed = plot_RF_Feature_Importance(df_seed_std_org,y_seed, 'Seeds')
Certainly, we can use feature selection through a Random Forest classifier to identify the most important features for each dataset. Based on the feature importance scores, we can select a subset of the most relevant features.
For the Wisconsin dataset, we select the first 5 features with the highest feature importance. These features are 'Worst Perimeter', 'Worst Radius', 'Worst Concave Points', 'Mean Concave Points', and 'Worst Area':
For the Seeds dataset, we select the first 3 features with the highest feature importance. These features are 'Length of Kernel Groove', 'Perimeter', and 'Area':
# Feature subsets chosen from the Random Forest importance ranking above.
selected_features_wisconsin = ['Worst Perimeter', 'Worst Radius', 'Worst Concave Points', 'Mean Concave Points', 'Worst Area']
selected_features_seeds = ['Length of Kernel Groove', 'Perimeter', 'Area']
# Use the untouched standardized copies (no appended cluster/PCA columns).
df_wdbc_std_x = df_wdbc_std_org[selected_features_wisconsin]
df_seed_std_x = df_seed_std_org[selected_features_seeds]
# KMeans on the RF-selected features.
res_wdbc_kmeans_rf = runKMeans(df_wdbc_std_x,y_wdbc, 'Wdbc')
res_wdbc_kmeans_rf['Technique'] = 'RF'
plot_silhouette(df_wdbc_std_x, 'Wdbc')
res_seeds_kmeans_rf = runKMeans(df_seed_std_x,y_seed, 'Seeds')
res_seeds_kmeans_rf['Technique'] = 'RF'
plot_silhouette(df_seed_std_x, 'Seeds')
# EM on the RF-selected features.
res_wdbc_gmm_rf = runGMM(df_wdbc_std_x,y_wdbc, 'Wdbc')
res_wdbc_gmm_rf['Technique'] = 'RF'
plot_silhouette_GMM(df_wdbc_std_x, 'Wdbc')
res_seeds_gmm_rf = runGMM(df_seed_std_x,y_seed, 'Seeds')
res_seeds_gmm_rf['Technique'] = 'RF'
plot_silhouette_GMM(df_seed_std_x, 'Seeds')
# Combine the per-technique KMeans / EM results for the WDBC dataset.
df_wdbc_kmeans = pd.concat([res_wdbc_kmeans_org, res_wdbc_kmeans_pca, res_wdbc_kmeans_ica, res_wdbc_kmeans_rp, res_wdbc_kmeans_rf])
df_wdbc_gmm = pd.concat([res_wdbc_gmm_org, res_wdbc_gmm_pca, res_wdbc_gmm_ica, res_wdbc_gmm_rp, res_wdbc_gmm_rf])
def plot_evaluation_metrics_on_all_technique(df, clustering, dataset_name):
    """Plot four clustering-evaluation metrics against the number of
    clusters, one curve per dimensionality-reduction technique, arranged
    in a 2x2 grid of subplots.

    `df` must contain 'Technique' and 'Clusters' columns plus one column
    per metric; `clustering` and `dataset_name` only label the figure.
    """
    metrics = ("Adjusted Rand Score", "Adjusted Mutual Info Score",
               "Davies-Bouldin Score", "Silhouette Coefficient")
    plt.figure(figsize=(15, 10))
    for idx, metric in enumerate(metrics):
        plt.subplot(2, 2, idx + 1)
        # One curve per technique (e.g. Org / PCA / ICA / RP / RF).
        for technique, grp in df.groupby('Technique'):
            plt.plot(grp['Clusters'], grp[metric], marker='o', label=technique)
        plt.xlabel('Number of Clusters')
        plt.ylabel(metric)
        plt.title(f'{metric} vs. Number of Clusters')
        # Legend pushed outside the axes so it does not cover the curves.
        plt.legend(title='Technique', loc='upper right', bbox_to_anchor=(1.3, 1))
        plt.grid(False)
    plt.suptitle(f"Results on {clustering} ({dataset_name} Dataset)", fontsize=16, y=1.02)
    plt.tight_layout()
    plt.show()
# Compare all techniques for the Wisconsin dataset, per clusterer.
plot_evaluation_metrics_on_all_technique(df_wdbc_kmeans, "KMeans", "Wdbc")
plot_evaluation_metrics_on_all_technique(df_wdbc_gmm, "EM", "Wdbc")
# Same stacking for the Seeds dataset.
# NOTE(review): `res_seed_kmeans_ica` is singular ("seed") while its
# siblings use "seeds" — presumably that is the name it was defined under
# earlier; verify against the earlier cell to rule out a NameError.
df_seeds_kmeans = pd.concat([res_seeds_kmeans_org, res_seeds_kmeans_pca, res_seed_kmeans_ica, res_seeds_kmeans_rp, res_seeds_kmeans_rf])
df_seeds_gmm = pd.concat([res_seeds_gmm_org, res_seeds_gmm_pca, res_seeds_gmm_ica, res_seeds_gmm_rp, res_seeds_gmm_rf])
plot_evaluation_metrics_on_all_technique(df_seeds_kmeans, "KMeans", "Seeds")
plot_evaluation_metrics_on_all_technique(df_seeds_gmm, "EM", "Seeds")
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
def splitting_dataset(X, y, test_size=0.20):
    """Split (X, y) into stratified train/test sets with a fixed seed.

    Parameters
    ----------
    X : feature matrix with one row per sample
    y : labels, same number of samples as X
    test_size : fraction of samples held out for testing (default 0.20)

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError : if X and y disagree on the number of samples.
    """
    from sklearn.model_selection import train_test_split
    # Explicit raise instead of `assert`: asserts are stripped when Python
    # runs with -O, silently disabling the validation.
    if X.shape[0] != y.shape[0]:
        raise ValueError("Number of samples in X and y should be the same.")
    # stratify=y preserves the class proportions in both splits;
    # random_state=0 makes the split reproducible across runs.
    return train_test_split(X, y, test_size=test_size, random_state=0, stratify=y)
def train_NN(df, y, technique_name):
    """Evaluate an MLP classifier on one feature representation.

    Splits the data 80/20 (stratified), then runs 5-fold cross-validation
    and returns a one-row DataFrame with the mean train/test accuracies,
    tagged with `technique_name`.

    NOTE(review): cross-validation is run on the 20% hold-out split only,
    exactly as in the original analysis — confirm this is intended, since
    it leaves 80% of the samples unused by the CV.
    """
    # 80% train / 20% test, stratified; the train split is unused because
    # cross_validate below clones and refits the estimator on each fold,
    # so a prior model.fit() would be dead work.
    _, X_test, _, y_test = splitting_dataset(df, y, test_size=0.2)
    model = MLPClassifier()
    results = cross_validate(model, X_test, y_test, cv=5,
                             scoring='accuracy', return_train_score=True)
    return pd.DataFrame([{
        'Technique': technique_name,
        'KFold-Train Acc': results['train_score'].mean(),
        'KFold-Test Acc': results['test_score'].mean(),
    }])
# NN accuracy on the true Wdbc labels for each reduced representation.
# `data_pca`, `W_` (ICA sources) and `W_RP` (random projection) are
# presumably built in earlier cells — confirm against the earlier code.
df_res_PCA = train_NN(data_pca, y_wdbc, 'PCA')
df_res_ICA = train_NN(W_, y_wdbc, 'ICA')
df_res_RP = train_NN(W_RP, y_wdbc, 'RP')
df_res_RF = train_NN(df_wdbc_std_x, y_wdbc, 'RF')
df_RES = pd.concat([df_res_PCA, df_res_ICA, df_res_RP, df_res_RF])
# Bare expression: notebook-style display of the combined results table.
df_RES
# Line plot of cross-validated test accuracy per reduction technique.
fig = plt.figure(figsize=(5, 5))
ax = fig.gca()
ax.plot(df_RES["Technique"], df_RES["KFold-Test Acc"], marker='o', linestyle='-')
ax.set_xlabel('Technique')
ax.set_ylabel('KFold-Test Accuracy')
ax.set_title('NN - Wisconsin Dataset')
ax.grid(False)
plt.show()
# Retrain the NN using the KMeans cluster assignments as the target
# instead of the true diagnosis labels; presumably the 'KMeans_Clusters'
# column was added to df_wdbc in an earlier cell — confirm.
df_res_PCA1 = train_NN(data_pca, df_wdbc['KMeans_Clusters'], 'PCA')
df_res_ICA1 = train_NN(W_, df_wdbc['KMeans_Clusters'], 'ICA')
df_res_RP1 = train_NN(W_RP, df_wdbc['KMeans_Clusters'], 'RP')
df_res_RF1 = train_NN(df_wdbc_std_x, df_wdbc['KMeans_Clusters'], 'RF')
df_RES1 = pd.concat([df_res_PCA1, df_res_ICA1, df_res_RP1, df_res_RF1])
df_RES1['Comment'] = 'Use KMeans Labels'
# Notebook-style display of the combined table.
df_RES1
# Same experiment with the GMM (EM) cluster assignments as the target;
# presumably 'GMM_Clusters' was added to df_wdbc earlier — confirm.
df_res_PCA2 = train_NN(data_pca, df_wdbc['GMM_Clusters'], 'PCA')
df_res_ICA2 = train_NN(W_, df_wdbc['GMM_Clusters'], 'ICA')
df_res_RP2 = train_NN(W_RP, df_wdbc['GMM_Clusters'], 'RP')
df_res_RF2 = train_NN(df_wdbc_std_x, df_wdbc['GMM_Clusters'], 'RF')
df_RES2 = pd.concat([df_res_PCA2, df_res_ICA2, df_res_RP2, df_res_RF2])
df_RES2['Comment'] = 'Use EM Labels'
# Notebook-style display of the combined table.
df_RES2
# Side-by-side bar charts: NN test accuracy per technique when trained on
# KMeans-derived labels (left) versus EM-derived labels (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (axes[0], df_RES1, 'Test Accuracy (Use KMeans Labels)', 'b'),
    (axes[1], df_RES2, 'Test Accuracy (Use EM Labels)', 'g'),
)
for ax, res, title, bar_color in panels:
    ax.bar(res["Technique"], res["KFold-Test Acc"], color=bar_color, alpha=0.7)
    ax.set_title(title)
    ax.set_ylim(0.7, 1.0)  # shared y-range keeps the two panels comparable
    ax.grid(False)
plt.tight_layout()
plt.show()